import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import plotly.express as px
# Load the drinking-water potability dataset (3276 rows x 10 columns; several
# feature columns contain missing values, as the preview below shows).
df = pd.read_csv('drinking_water_potability.csv')
df
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890456 | 20791.31898 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.05786 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.54173 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.41744 | 8.059332 | 356.886136 | 363.266516 | 18.436525 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.98634 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3271 | 4.668102 | 193.681736 | 47580.99160 | 7.166639 | 359.948574 | 526.424171 | 13.894419 | 66.687695 | 4.435821 | 1 |
| 3272 | 7.808856 | 193.553212 | 17329.80216 | 8.061362 | NaN | 392.449580 | 19.903225 | NaN | 2.798243 | 1 |
| 3273 | 9.419510 | 175.762646 | 33155.57822 | 7.350233 | NaN | 432.044783 | 11.039070 | 69.845400 | 3.298875 | 1 |
| 3274 | 5.126763 | 230.603758 | 11983.86938 | 6.303357 | NaN | 402.883113 | 11.168946 | 77.488213 | 4.708658 | 1 |
| 3275 | 7.874671 | 195.102299 | 17404.17706 | 7.509306 | NaN | 327.459761 | 16.140368 | 78.698446 | 2.309149 | 1 |
3276 rows × 10 columns
# Flag every row that has at least one missing value (11th helper column).
df["Missing"] = df.isna().any(axis=1)
# The nine physico-chemical feature columns (everything except the target).
dimensions = ["ph",
"Hardness",
"Solids",
"Chloramines",
"Sulfate",
"Conductivity",
"Organic_carbon",
"Trihalomethanes",
"Turbidity"]
# 1: drop every row with 2 or 3 missing values, mean-impute rows with a single
# missing value (thresh=10 keeps rows with >= 10 non-NA cells out of 11 columns).
df_1 = df.dropna(thresh=10)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(df_1)
df_1 = pd.DataFrame(imp_mean.transform(df_1))
# 2: drop only rows with 3 missing values, mean-impute the rest (thresh=9 -> at most 2 NA).
df_2 = df.dropna(thresh=9)
imp_mean.fit(df_2)
df_2 = pd.DataFrame(imp_mean.transform(df_2))
# 3: same row filtering as df_1, but multivariate (iterative) imputation, seeded.
df_3 = df.dropna(thresh=10)
imp_iter = IterativeImputer(random_state=0)
df_3 = pd.DataFrame(imp_iter.fit_transform(df_3))
# 4: multivariate *stochastic* imputation (sample_posterior draws from the
# posterior; no random_state here, so results vary between runs).
df_4 = df.dropna(thresh=10)
it_imp = IterativeImputer(sample_posterior=True)
df_4 = pd.DataFrame(it_imp.fit_transform(df_4))
# Dictionary holding the four imputed dataframes of interest.
all_df = {"df_1": df_1, "df_2": df_2,
"df_3": df_3, "df_4": df_4}
for dataframe in all_df.values():
# first restore the column names that were lost (the imputers return bare arrays)
dataframe.columns = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability',
'Missing']
# the helper "Missing" flag is no longer useful once imputation is done
dataframe.drop('Missing', axis=1, inplace=True)
# Accumulator DataFrame that collects every grid-search result below.
stats = pd.DataFrame()
def computeModelStats(model, stats, dataset, hyperParams, randomizedSearch=False):
    """
    Cross-validated hyper-parameter search for one model on one dataset.

    Runs a stratified 5-fold GridSearchCV (or RandomizedSearchCV), plots the
    mean f1 of every candidate on both train and test folds, and returns the
    stats DataFrame extended with the new rows.

    :param model: model that will be assessed
    :type model: tuple(predictor, name: str)
    :param stats: DataFrame in which results will be accumulated; it is not
        mutated in place — the extended frame is returned
    :type stats: DataFrame
    :param dataset: dataset that will be used
    :type dataset: tuple(dict(X: DataFrame, y: array), name: str)
    :param hyperParams: hyper-parameter grid that will be assessed
    :type hyperParams: dict
    :param randomizedSearch: False for an exhaustive search; a positive int n
        to sample only n candidates with RandomizedSearchCV instead
    :type randomizedSearch: bool | int
    :return: stats with one test row and one train row per candidate appended
    :rtype: DataFrame
    """
    X = dataset[0]["X"]
    y = dataset[0]["y"]
    if not randomizedSearch:
        grid_search = GridSearchCV(model[0], cv=StratifiedKFold(n_splits=5), refit='f1',
                                   scoring=['f1', 'recall', 'precision'], verbose=3, n_jobs=-1,
                                   param_grid=hyperParams, return_train_score=True)
    else:
        grid_search = RandomizedSearchCV(model[0], cv=StratifiedKFold(n_splits=5), refit='f1',
                                         scoring=['f1', 'recall', 'precision'], verbose=3, n_jobs=-1,
                                         param_distributions=hyperParams, return_train_score=True,
                                         n_iter=randomizedSearch)
    grid_search.fit(X, y)
    # Strip the split prefix ("_test"/"_train") so both frames end up with the
    # same column names (mean_f1, std_f1, ...) before being stacked.
    temp_test = pd.DataFrame({key.replace("_test", ""): grid_search.cv_results_[key] for key in
                              ["mean_test_f1", "std_test_f1", "mean_test_recall", "mean_test_precision", "params"]})
    # Bug fix: the original also pulled "std_test_f1" into the train frame,
    # which left a stray, mostly-NaN "std_test_f1" column after concatenation.
    temp_train = pd.DataFrame({key.replace("_train", ""): grid_search.cv_results_[key] for key in
                               ["mean_train_f1", "std_train_f1", "mean_train_recall", "mean_train_precision",
                                "params"]})
    temp_train["type"] = "train"
    temp_test["type"] = "test"
    # DataFrame.append was removed in pandas 2.0 — use pd.concat. Indices are
    # deliberately kept: they identify the candidate inside the search.
    temp = pd.concat([temp_test, temp_train])
    temp["dataset"] = dataset[1]
    temp["model"] = model[1]
    # Store the params dicts as strings so they stay hashable and plottable.
    temp["params"] = temp["params"].astype(str)
    stats = pd.concat([stats, temp])
    fig = px.bar(temp, y="mean_f1", hover_data=["params"], barmode="group", hover_name="params", color="type",
                 title=f"Mean f1 score on testing and training set for each parameter combination <br> (Model: {model[1]}, Dataset: {dataset[1]})",
                 labels={"mean_f1": f"Average f1", "index": "Params"})
    fig.show()
    return stats
def extract_features_and_labels(df):
    """Split a potability dataframe into a feature matrix and a target vector.

    :param df: dataframe containing a "Potability" column plus feature columns
    :return: {"X": feature DataFrame, "y": Potability Series}
    """
    # Echo the columns so each dataset variant can be eyeballed in the notebook.
    print(df.columns)
    labels = df.Potability
    features = df.drop("Potability", axis=1)
    return {"X": features, "y": labels}
# One (features/labels dict, human-readable label) tuple per imputation
# strategy; the *_subset_selection variants keep only the
# {ph, Hardness, Chloramines, Sulfate} features.
# NOTE(review): df_1 keeps rows with at most ONE missing value (thresh=10 on
# 11 columns), so the "2 MV" wording of the first label looks off — confirm.
mean_accurracy_3_missing_values = (
extract_features_and_labels(df_1), "Mean imputation for individuals with 2 MV")
mean_accurracy_2_3_missing_values = (
extract_features_and_labels(df_2), "Mean imputation for individuals with 2 or 3 MV")
multivariate_feature_imputation = (extract_features_and_labels(df_3), "Multivariate feature imputation")
multivariate_stochastic_feature_imputation = (
extract_features_and_labels(df_4), "Multivariate stochastic feature imputation")
multivariate_feature_imputation_subset_selection = (
extract_features_and_labels(df_3[["ph", "Hardness", "Chloramines", "Sulfate", "Potability"]]),
"Multivariate feature imputation with subset selection")
multivariate_stochastic_feature_imputation_subset_selection = (
extract_features_and_labels(df_4[["ph", "Hardness", "Chloramines", "Sulfate", "Potability"]]),
"Multivariate stochastic feature imputation with subset selection")
mean_accurracy_2_3_missing_values_subset_selection = (
extract_features_and_labels(df_2[["ph", "Hardness", "Chloramines", "Sulfate", "Potability"]]),
"Mean imputation subset selection for individuals with 2 or 3 MV")
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
dtype='object')
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
dtype='object')
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
dtype='object')
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
dtype='object')
Index(['ph', 'Hardness', 'Chloramines', 'Sulfate', 'Potability'], dtype='object')
Index(['ph', 'Hardness', 'Chloramines', 'Sulfate', 'Potability'], dtype='object')
Index(['ph', 'Hardness', 'Chloramines', 'Sulfate', 'Potability'], dtype='object')
We know we usually tend to end up choosing random forest. However, we wanted to test several datasets first: given random forests' ability to handle unscaled data, this step will help us prune some datasets before trying the next models.
def random_forest():
    """Return a fresh RandomForestClassifier together with its display name."""
    classifier = RandomForestClassifier()
    return classifier, "RandomForest Classifier"
# Hyper-parameter grid explored for the random forest (3*4*4*2 = 96 candidates).
pgrid = {"max_depth": [3, 6, 9],
         "n_estimators": [50, 80, 110, 130],
         "min_samples_split": [3, 6, 9, 12],
         "max_features": ["sqrt", "log2"]}
# Random-forest grid search (96 candidates x 5 stratified folds) run on all
# seven dataset variants; each call extends `stats` and shows a bar chart.
stats = computeModelStats(random_forest(), stats, mean_accurracy_3_missing_values, pgrid)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(random_forest(), stats, mean_accurracy_2_3_missing_values, pgrid)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(random_forest(), stats, mean_accurracy_2_3_missing_values_subset_selection, pgrid)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(random_forest(), stats, multivariate_feature_imputation, pgrid)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(random_forest(), stats, multivariate_stochastic_feature_imputation, pgrid)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(random_forest(), stats, multivariate_feature_imputation_subset_selection, pgrid)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(random_forest(), stats, multivariate_stochastic_feature_imputation_subset_selection, pgrid)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
def extract_rows_from_main_df(model=None, dataset=None, type="test"):
    """
    Filter the global ``stats`` DataFrame down to one model / dataset / split.

    :param model: optional (predictor, name) tuple; keep only this model's rows
    :param dataset: optional (features/labels, name) tuple; keep only this dataset's rows
    :param type: split to keep, "test" or "train" (the name shadows the builtin
        but is kept for backward compatibility with keyword callers)
    :return: the filtered view of ``stats``
    """
    # Bug fix: `is not None` instead of `!= None` — identity is the correct
    # (and pandas-safe) way to test for a missing argument.
    mask1 = stats["dataset"] == dataset[1] if dataset is not None else True
    mask2 = stats["model"] == model[1] if model is not None else True
    mask3 = stats["type"] == type
    return stats[mask1 & mask2 & mask3]
def plot_params_incidence(model=None, dataset=None, params=None, type="test", barchart=False):
    """
    Plot the average f1 (and its mean std) as a function of one hyper-parameter.

    :param model: optional (predictor, name) tuple to restrict the rows to
    :param dataset: optional (features/labels, name) tuple to restrict the rows to
    :param params: name of the hyper-parameter to group by (e.g. "max_depth")
    :param type: split whose scores are plotted, "test" or "train"
    :param barchart: True for a bar chart with std error bars, False for a line
    :return: None (the figure is shown)
    """
    import ast  # local import: only needed here

    df_stats_temp = extract_rows_from_main_df(model=model, dataset=dataset, type=type)
    # The "params" column holds str(dict); parse it back with literal_eval —
    # same result as eval on these reprs, without executing arbitrary code.
    df_stats_temp = df_stats_temp.join(
        pd.json_normalize(df_stats_temp["params"].apply(ast.literal_eval))
    ).groupby([params]).agg({"mean_f1": "mean", "std_f1": "mean"})
    # Bug fix: the line-chart branch was missing the closing ")" in its title;
    # build the title once so both branches stay in sync.
    title = (f"Mean f1 score on testing depending on hyper parameter values "
             f"<br>(Model: {model[1] if model else 'All'}, Dataset: {dataset[1] if dataset else 'All'})")
    if not barchart:
        return px.line(df_stats_temp, labels="params", y="mean_f1", title=title).show()
    return px.bar(df_stats_temp, labels="params", y="mean_f1", title=title,
                  error_y="std_f1").show()
def carriage_return_for_long_str(string, char="<br>"):
    """Wrap a long label by appending `char` to the word just before its middle.

    Strings of fewer than six words are returned untouched.

    :param string: space-separated label
    :param char: break marker to append (default is plotly's "<br>")
    :return: the label with at most one break marker inserted
    """
    tokens = string.split(" ")
    if len(tokens) < 6:
        return string
    middle = len(tokens) // 2 - 1
    tokens[middle] = tokens[middle] + char
    return " ".join(tokens)
def plot_mean_score_depending_on_dataset(model):
    """
    Bar chart of the average f1 score obtained by `model` on every dataset,
    sorted from best to worst.

    :param model: (predictor, name) tuple as produced by the model factories
    """
    per_dataset = extract_rows_from_main_df(model=model, dataset=None).groupby("dataset").agg(
        {"mean_f1": "mean"})
    per_dataset["dataset"] = per_dataset.index
    # wrap long dataset names onto two lines so the x-axis labels stay readable
    per_dataset["dataset"] = per_dataset["dataset"].apply(carriage_return_for_long_str)
    chart = px.bar(per_dataset, y="mean_f1", x="dataset",
                   title=f"Average f1 score for each dataset with {model[1]}")
    chart.update_xaxes(categoryorder="total descending")
    chart.show()
# Visualize the random-forest round: average score per dataset, then the
# marginal effect of each hyper-parameter of the grid.
plot_mean_score_depending_on_dataset(random_forest())
plot_params_incidence(model=random_forest(), params="max_depth", barchart=True)
plot_params_incidence(model=random_forest(), params="n_estimators", barchart=True)
plot_params_incidence(model=random_forest(), params="min_samples_split", barchart=True)
plot_params_incidence(model=random_forest(), params="max_features", barchart=True)
def SVM_with_scale():
    """Return an SVC wrapped in a StandardScaler pipeline, plus its display name."""
    scaled_svm = make_pipeline(StandardScaler(), SVC())
    return scaled_svm, "SVM with scaled data"
# Quick sanity check of the pipeline structure.
print(SVM_with_scale())
# Grid explored for the pipeline's SVC step (5 gammas x 7 Cs = 35 candidates).
svm_hyperparams = {
    "svc__gamma": [1, .5, .1, .01, .001],
    "svc__C": [1, 3, 5, 7, 9, 11, 13],
}
(Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())]), 'SVM with scaled data')
# SVM grid search (35 candidates x 5 folds) on the retained datasets, then the
# marginal effect of C and gamma, and the per-dataset averages.
# NOTE(review): the mean-imputation-subset dataset is searched twice (2nd and
# 5th calls), which duplicates its rows in `stats` — confirm this is intended.
stats = computeModelStats(SVM_with_scale(), stats, multivariate_feature_imputation_subset_selection, svm_hyperparams)
Fitting 5 folds for each of 35 candidates, totalling 175 fits
stats = computeModelStats(SVM_with_scale(), stats, mean_accurracy_2_3_missing_values_subset_selection, svm_hyperparams)
Fitting 5 folds for each of 35 candidates, totalling 175 fits
stats = computeModelStats(SVM_with_scale(), stats, multivariate_stochastic_feature_imputation_subset_selection,
svm_hyperparams)
Fitting 5 folds for each of 35 candidates, totalling 175 fits
stats = computeModelStats(SVM_with_scale(), stats, multivariate_feature_imputation, svm_hyperparams)
Fitting 5 folds for each of 35 candidates, totalling 175 fits
stats = computeModelStats(SVM_with_scale(), stats, mean_accurracy_2_3_missing_values_subset_selection, svm_hyperparams)
Fitting 5 folds for each of 35 candidates, totalling 175 fits
plot_params_incidence(model=SVM_with_scale(), params="svc__C", barchart=True)
plot_params_incidence(model=SVM_with_scale(), params="svc__gamma", barchart=True)
plot_mean_score_depending_on_dataset(SVM_with_scale())
def extreme_random_forest():
    """
    Return an extremely-randomized-trees classifier with its display name.

    Bug fix: the original returned a plain RandomForestClassifier while
    labelling it "ExtremeRandomForest Classifier". ExtraTreesClassifier —
    already imported at the top of the file but never used — is the actual
    extremely-randomized-trees estimator.
    """
    return (ExtraTreesClassifier(), "ExtremeRandomForest Classifier")
# Same grid as the random-forest round (3*4*4*2 = 96 candidates).
extreme_random_forest_params = {"max_depth": [3, 6, 9],
                                "n_estimators": [50, 80, 110, 130],
                                "min_samples_split": [3, 6, 9, 12],
                                "max_features": ["sqrt", "log2"]}
# Extra-trees grid search on the five datasets retained after the RF round,
# then the per-parameter and per-dataset summaries.
# NOTE(review): the mean-imputation-subset dataset is searched twice (3rd and
# 5th calls) — the repeat adds duplicate rows to `stats`; confirm intended.
stats = computeModelStats(extreme_random_forest(), stats, multivariate_feature_imputation_subset_selection,
extreme_random_forest_params)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(extreme_random_forest(), stats, multivariate_stochastic_feature_imputation_subset_selection,
extreme_random_forest_params)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(extreme_random_forest(), stats, mean_accurracy_2_3_missing_values_subset_selection,
extreme_random_forest_params)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(extreme_random_forest(), stats, multivariate_feature_imputation, extreme_random_forest_params)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
stats = computeModelStats(extreme_random_forest(), stats, mean_accurracy_2_3_missing_values_subset_selection,
extreme_random_forest_params)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
plot_params_incidence(extreme_random_forest(), None, list(extreme_random_forest_params.keys())[0], barchart=True)
plot_params_incidence(extreme_random_forest(), None, list(extreme_random_forest_params.keys())[1], barchart=True)
plot_params_incidence(extreme_random_forest(), None, list(extreme_random_forest_params.keys())[2], barchart=True)
plot_params_incidence(extreme_random_forest(), None, list(extreme_random_forest_params.keys())[3], barchart=True)
plot_mean_score_depending_on_dataset(extreme_random_forest())
def knn():
    """Return a k-nearest-neighbours pipeline on standardized features, with its name."""
    pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
    return pipeline, "KNN Classifier"
# Grid for the pipeline's KNeighborsClassifier step (3*2*8 = 48 candidates).
knn_params = {
    "kneighborsclassifier__metric": ["euclidean", "manhattan", "chebyshev"],
    "kneighborsclassifier__weights": ["uniform", "distance"],
    "kneighborsclassifier__n_neighbors": [2, 4, 6, 8, 10, 12, 14, 16]
}
# KNN grid search (48 candidates x 5 folds) on the retained datasets, then the
# marginal effect of each hyper-parameter and the per-dataset averages.
stats = computeModelStats(knn(), stats, multivariate_feature_imputation_subset_selection,
knn_params)
Fitting 5 folds for each of 48 candidates, totalling 240 fits
stats = computeModelStats(knn(),stats, multivariate_stochastic_feature_imputation_subset_selection, knn_params)
Fitting 5 folds for each of 48 candidates, totalling 240 fits
stats = computeModelStats(knn(), stats, mean_accurracy_2_3_missing_values_subset_selection,knn_params)
Fitting 5 folds for each of 48 candidates, totalling 240 fits
stats = computeModelStats(knn(),stats, multivariate_feature_imputation, knn_params)
Fitting 5 folds for each of 48 candidates, totalling 240 fits
stats = computeModelStats(knn(),stats, mean_accurracy_2_3_missing_values, knn_params)
Fitting 5 folds for each of 48 candidates, totalling 240 fits
plot_params_incidence(knn(), None, list(knn_params.keys())[0], barchart=True)
plot_params_incidence(knn(), None, list(knn_params.keys())[1], barchart=True)
plot_params_incidence(knn(), None, list(knn_params.keys())[2], barchart=True)
plot_mean_score_depending_on_dataset(knn())
def qda():
    """Return quadratic discriminant analysis on standardized features, with its name."""
    return make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis()), "Quadratic Discriminant Analysis"
# Bug fix: the original list read "-1.2-1.1,1,0.9" — the missing comma fused
# -1.2 and -1.1 into a single value (-2.3), and "1, 0.9" dropped the minus
# signs of -1 and -0.9. Restored the evenly spaced grid.
# NOTE(review): sklearn's reg_param is normally in [0, 1]; negative values are
# accepted numerically but unusual — confirm they are intended.
qda_params = {
    "quadraticdiscriminantanalysis__reg_param": [-1.2, -1.1, -1, -0.9, -0.8, -0.7, -0.6, -0.5,
                                                 -0.4, -0.3, -0.2, -0.1, 0.1, 0.2, 0.3, 0.4],
}
# QDA grid search (reg_param candidates x 5 folds) on the same dataset set.
stats = computeModelStats(qda(), stats, multivariate_feature_imputation_subset_selection,
qda_params)
Fitting 5 folds for each of 15 candidates, totalling 75 fits
stats = computeModelStats(qda(), stats, multivariate_stochastic_feature_imputation_subset_selection,
qda_params)
Fitting 5 folds for each of 15 candidates, totalling 75 fits
stats = computeModelStats(qda(), stats, mean_accurracy_2_3_missing_values_subset_selection,
qda_params)
Fitting 5 folds for each of 15 candidates, totalling 75 fits
stats = computeModelStats(qda(), stats, multivariate_feature_imputation,
qda_params)
Fitting 5 folds for each of 15 candidates, totalling 75 fits
stats = computeModelStats(qda(), stats, mean_accurracy_2_3_missing_values,
qda_params)
Fitting 5 folds for each of 15 candidates, totalling 75 fits
# Bug fix: these two calls re-plotted the extreme random forest instead of the
# QDA results computed just above (copy-paste slip) — plot QDA instead.
plot_params_incidence(qda(), None, list(qda_params.keys())[0], barchart=True)
plot_mean_score_depending_on_dataset(qda())
from IPython.display import display
# Keep only the validation (test-fold) rows for the final model comparison.
stats_validation_step = stats[stats["type"] == "test"]
# For every (model, dataset) pair, flag the row(s) reaching that pair's best
# mean f1. 'max' as a string is the idiomatic pandas spelling (the builtin
# `max` works too, but the string avoids any shadowing surprises).
mask = stats_validation_step.groupby(['model', 'dataset'])['mean_f1'].transform('max') == stats_validation_step['mean_f1']
best_scores_per_model_per_dataset = stats_validation_step[mask]
pd.set_option('display.max_colwidth', None)  # show the full params strings
best_scores_per_model_per_dataset
| mean_f1 | std_f1 | mean_recall | mean_precision | params | type | std_test_f1 | dataset | model | |
|---|---|---|---|---|---|---|---|---|---|
| 88 | 0.330597 | 0.042802 | 0.226885 | 0.614571 | {'max_depth': 9, 'max_features': 'log2', 'min_samples_split': 9, 'n_estimators': 50} | test | NaN | Mean imputation for individuals with 2 MV | RandomForest Classifier |
| 84 | 0.317940 | 0.031407 | 0.215365 | 0.625257 | {'max_depth': 9, 'max_features': 'log2', 'min_samples_split': 6, 'n_estimators': 50} | test | NaN | Mean imputation for individuals with 2 or 3 MV | RandomForest Classifier |
| 91 | 0.359545 | 0.043096 | 0.256857 | 0.614725 | {'max_depth': 9, 'max_features': 'log2', 'min_samples_split': 9, 'n_estimators': 130} | test | NaN | Mean imputation subset selection for individuals with 2 or 3 MV | RandomForest Classifier |
| 66 | 0.328135 | 0.019202 | 0.220314 | 0.653895 | {'max_depth': 9, 'max_features': 'sqrt', 'min_samples_split': 3, 'n_estimators': 110} | test | NaN | Multivariate feature imputation | RandomForest Classifier |
| 64 | 0.325869 | 0.032004 | 0.224389 | 0.603020 | {'max_depth': 9, 'max_features': 'sqrt', 'min_samples_split': 3, 'n_estimators': 50} | test | NaN | Multivariate stochastic feature imputation | RandomForest Classifier |
| 89 | 0.378667 | 0.043613 | 0.276009 | 0.607710 | {'max_depth': 9, 'max_features': 'log2', 'min_samples_split': 9, 'n_estimators': 80} | test | NaN | Multivariate feature imputation with subset selection | RandomForest Classifier |
| 64 | 0.353546 | 0.049712 | 0.258782 | 0.565040 | {'max_depth': 9, 'max_features': 'sqrt', 'min_samples_split': 3, 'n_estimators': 50} | test | NaN | Multivariate stochastic feature imputation with subset selection | RandomForest Classifier |
| 25 | 0.403425 | 0.030562 | 0.328404 | 0.528034 | {'svc__C': 11, 'svc__gamma': 1} | test | NaN | Multivariate feature imputation with subset selection | SVM with scaled data |
| 25 | 0.400461 | 0.028856 | 0.322635 | 0.531972 | {'svc__C': 11, 'svc__gamma': 1} | test | NaN | Mean imputation subset selection for individuals with 2 or 3 MV | SVM with scaled data |
| 30 | 0.395449 | 0.036918 | 0.331720 | 0.491092 | {'svc__C': 13, 'svc__gamma': 1} | test | NaN | Multivariate stochastic feature imputation with subset selection | SVM with scaled data |
| 27 | 0.446818 | 0.036860 | 0.378377 | 0.547360 | {'svc__C': 11, 'svc__gamma': 0.1} | test | NaN | Multivariate feature imputation | SVM with scaled data |
| 25 | 0.400461 | 0.028856 | 0.322635 | 0.531972 | {'svc__C': 11, 'svc__gamma': 1} | test | NaN | Mean imputation subset selection for individuals with 2 or 3 MV | SVM with scaled data |
| 76 | 0.386814 | 0.042238 | 0.284199 | 0.612373 | {'max_depth': 9, 'max_features': 'sqrt', 'min_samples_split': 12, 'n_estimators': 50} | test | NaN | Multivariate feature imputation with subset selection | ExtremeRandomForest Classifier |
| 95 | 0.353153 | 0.040155 | 0.256327 | 0.574762 | {'max_depth': 9, 'max_features': 'log2', 'min_samples_split': 12, 'n_estimators': 130} | test | NaN | Multivariate stochastic feature imputation with subset selection | ExtremeRandomForest Classifier |
| 88 | 0.332628 | 0.032305 | 0.226889 | 0.630297 | {'max_depth': 9, 'max_features': 'log2', 'min_samples_split': 9, 'n_estimators': 50} | test | NaN | Multivariate feature imputation | ExtremeRandomForest Classifier |
| 78 | 0.363751 | 0.043846 | 0.260794 | 0.615768 | {'max_depth': 9, 'max_features': 'sqrt', 'min_samples_split': 12, 'n_estimators': 110} | test | NaN | Mean imputation subset selection for individuals with 2 or 3 MV | ExtremeRandomForest Classifier |
| 1 | 0.448715 | 0.028879 | 0.436541 | 0.462133 | {'kneighborsclassifier__metric': 'euclidean', 'kneighborsclassifier__n_neighbors': 2, 'kneighborsclassifier__weights': 'distance'} | test | NaN | Multivariate feature imputation with subset selection | KNN Classifier |
| 33 | 0.432454 | 0.014650 | 0.420957 | 0.445139 | {'kneighborsclassifier__metric': 'chebyshev', 'kneighborsclassifier__n_neighbors': 2, 'kneighborsclassifier__weights': 'distance'} | test | NaN | Multivariate stochastic feature imputation with subset selection | KNN Classifier |
| 1 | 0.460492 | 0.027918 | 0.450300 | 0.471585 | {'kneighborsclassifier__metric': 'euclidean', 'kneighborsclassifier__n_neighbors': 2, 'kneighborsclassifier__weights': 'distance'} | test | NaN | Mean imputation subset selection for individuals with 2 or 3 MV | KNN Classifier |
| 1 | 0.449332 | 0.012543 | 0.441425 | 0.457542 | {'kneighborsclassifier__metric': 'euclidean', 'kneighborsclassifier__n_neighbors': 2, 'kneighborsclassifier__weights': 'distance'} | test | NaN | Multivariate feature imputation | KNN Classifier |
| 1 | 0.442600 | 0.014275 | 0.435374 | 0.450227 | {'kneighborsclassifier__metric': 'euclidean', 'kneighborsclassifier__n_neighbors': 2, 'kneighborsclassifier__weights': 'distance'} | test | NaN | Mean imputation for individuals with 2 or 3 MV | KNN Classifier |
| 0 | 0.491723 | 0.041723 | 0.460234 | 0.543717 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Multivariate feature imputation with subset selection | Quadratic Discriminant Analysis |
| 0 | 0.441553 | 0.047738 | 0.391459 | 0.518408 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Multivariate stochastic feature imputation with subset selection | Quadratic Discriminant Analysis |
| 0 | 0.467955 | 0.041384 | 0.418199 | 0.543271 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Mean imputation subset selection for individuals with 2 or 3 MV | Quadratic Discriminant Analysis |
| 0 | 0.495829 | 0.040500 | 0.488913 | 0.520432 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Multivariate feature imputation | Quadratic Discriminant Analysis |
| 0 | 0.469261 | 0.048546 | 0.437840 | 0.516327 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Mean imputation for individuals with 2 or 3 MV | Quadratic Discriminant Analysis |
| 3 | 0.491723 | 0.041723 | 0.460234 | 0.543717 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Multivariate feature imputation with subset selection | Quadratic Discriminant Analysis |
| 3 | 0.491723 | 0.041723 | 0.460234 | 0.543717 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Multivariate feature imputation with subset selection | Quadratic Discriminant Analysis |
| 3 | 0.441553 | 0.047738 | 0.391459 | 0.518408 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Multivariate stochastic feature imputation with subset selection | Quadratic Discriminant Analysis |
| 3 | 0.467955 | 0.041384 | 0.418199 | 0.543271 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Mean imputation subset selection for individuals with 2 or 3 MV | Quadratic Discriminant Analysis |
| 3 | 0.495829 | 0.040500 | 0.488913 | 0.520432 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Multivariate feature imputation | Quadratic Discriminant Analysis |
| 3 | 0.469261 | 0.048546 | 0.437840 | 0.516327 | {'quadraticdiscriminantanalysis__reg_param': -0.8} | test | NaN | Mean imputation for individuals with 2 or 3 MV | Quadratic Discriminant Analysis |
# Pivot to a model x dataset table of best f1 scores; dropna(thresh=3, axis=1)
# discards datasets that were evaluated on fewer than three models.
best_scores_per_model_per_dataset.pivot_table(index='model',columns='dataset', values='mean_f1').dropna(thresh=3,axis=1)
| dataset | Mean imputation for individuals with 2 or 3 MV | Mean imputation subset selection for individuals with 2 or 3 MV | Multivariate feature imputation | Multivariate feature imputation with subset selection | Multivariate stochastic feature imputation with subset selection |
|---|---|---|---|---|---|
| model | |||||
| ExtremeRandomForest Classifier | NaN | 0.363751 | 0.332628 | 0.386814 | 0.353153 |
| KNN Classifier | 0.442600 | 0.460492 | 0.449332 | 0.448715 | 0.432454 |
| Quadratic Discriminant Analysis | 0.469261 | 0.467955 | 0.495829 | 0.491723 | 0.441553 |
| RandomForest Classifier | 0.317940 | 0.359545 | 0.328135 | 0.378667 | 0.353546 |
| SVM with scaled data | NaN | 0.400461 | 0.446818 | 0.403425 | 0.395449 |